The packages; dplyr, plyr , rehsape, reshape2, ggplot were applied for analysis the crop heal data (H. Wickham and Francois 2015; H. Wickham 2011; Wickham and Hadley 2007)
#==============
library(dplyr) # arrange data structure
library(plyr)
library(reshape)
library(reshape2)
library(lubridate)
#================
library(ggplot2) # plotting
library(gridExtra)
library(scales)
library(cowplot)
#===============
library(bioDist) # Co-ocurrance analysis
library(vegan) # Co-ocurrance analysis
#==============
library(igraph) # Network analysis package
Survey data were stored at the google drive under the this link.
library(RCurl) # run this package for load the data form the website
file <- getURL("https://docs.google.com/spreadsheets/d/1zB7gNdI7Nk7SuHuPWcjzaKnjuwkvL6sOVMo0zMfuV-c/pub?gid=558862364&single=true&output=csv") # load data from the google drive
data <- read.csv(text = file) # read data which is formated as the csv
#======================================================================
data[data == "-"] <- NA # replace '-' with NA
data[data == ""] <- NA # replace 'missing data' with NA
#==== to lower variable names ====
names(data) <- tolower(names(data)) # for more consistancy
#======================================================================
# remove the variables that are not included for analysis
data$phase <- NULL # there is only one type yype of phase in the survey
data$identifier <- NULL # this variable is not included in the analysis
data$village <- NULL # remove name of village
data$year <- NULL # remove year data
data$season <- NULL # remove season data
data$lat <- NULL # remove latitude data
data$long <- NULL # remove longitude data
data$fa <- NULL # field area is not include in the analysis
data$fn <- NULL # farmer name can not be included in this survey analysis
data$fp <- NULL # I do not know what is fp
data$lfm <- NULL # there is only one type of land form in this survey
data$ced <- NULL # Date data can not be included in the network analysis
data$cedjul <- NULL # remove crop establisment julian date data
data$hd <- NULL # Date data can not be included in the network analysis
data$hdjul <- NULL # remove harvest julian date
data$cvr <- NULL # reove crop varieties
data$varcoded <- NULL # I will recode them
data$fymcoded <- NULL # remove code data of fym
data$mu <- NULL # no record of mullucicide data
data$nplsqm <- NULL # remove number of plant per square meter
data$rbpx <- NULL # no record of rice bug p
#======================================================================
#=================== corract the variable type ========================
#======================================================================
data <- transform(data,
country = as.factor(country),
pc = as.factor(pc),
cem = as.factor(cem),
ast = as.factor(ast),
ccd = as.numeric(ccd),
vartype = as.factor(vartype),
fym = as.character(fym),
n = as.numeric(n),
p = as.numeric(p) ,
k = as.numeric(k),
mf = as.numeric(mf),
wcp = as.factor(wcp),
iu = as.numeric(iu),
hu = as.numeric(hu),
fu = as.numeric(fu),
cs = as.factor(cs),
ldg = as.numeric(ldg),
yield = as.numeric(yield) ,
dscum = as.factor(dscum),
wecum = as.factor(wecum),
ntmax = as.numeric(ntmax),
npmax = as.numeric(npmax),
nltmax = as.numeric(nltmax),
nlhmax = as.numeric(nltmax),
waa = as.numeric(waa),
wba = as.numeric(wba) ,
dhx = as.numeric(dhx),
whx = as.numeric(whx),
ssx = as.numeric(ssx),
wma = as.numeric(wma),
lfa = as.numeric(lfa),
lma = as.numeric(lma),
rha = as.numeric(rha) ,
thrx = as.numeric(thrx),
pmx = as.numeric(pmx),
defa = as.numeric(defa),
bphx = as.numeric(bphx),
wbpx = as.numeric(wbpx),
awx = as.numeric(awx),
rbx =as.numeric(rbx),
rbbx = as.numeric(rbbx),
glhx = as.numeric(glhx),
stbx=as.numeric(stbx),
hbx= as.numeric(hbx),
bbx = as.numeric(bbx),
blba = as.numeric(blba),
lba = as.numeric(lba),
bsa = as.numeric(bsa),
blsa = as.numeric(blsa),
nbsa = as.numeric(nbsa),
rsa = as.numeric(rsa),
lsa = as.numeric(lsa),
shbx = as.numeric(shbx) ,
shrx = as.numeric(shrx),
srx= as.numeric(srx),
fsmx = as.numeric(fsmx),
nbx = as.numeric(nbx),
dpx = as.numeric(dpx),
rtdx = as.numeric(rtdx),
rsdx = as.numeric(rsdx),
gsdx =as.numeric(gsdx),
rtx = as.numeric(rtx)
)
#======================================================================
#=================== corract the variable type ========================
#======================================================================
# Now data are in the right format and ready to further analysis, but there are some variables needed to code as the number not character
# Before proforming cluster analysis which is the further analysis, I need to code the character to number
##### recode the previous crop
#if previosu crop data are rice, they will be coded as 1, but others, not rice, they will be coded as 0.
data$pc <- ifelse(data$pc == "rice", 1, 0)
Transplanting rice is 1, and direct seeded rice is 2.
#Crop establisment method
levels(data$cem)[levels(data$cem) == "trp"] <- 1
levels(data$cem)[levels(data$cem) == "TPR"] <- 1
levels(data$cem)[levels(data$cem) == "DSR"] <- 2
levels(data$cem)[levels(data$cem) == "dsr"] <- 2
fym there are two type 0 and 1, raw data are recorded as no, yes, and value, if the value is 0 which mean 0 and if the value more than 0 which means 1
data$fym <- ifelse(data$fym == "no", 0,
ifelse(data$fym == "0", 0, 1
)
)
Vartype there are three type treditional varieties coded as 1, modern varities coded as 2 and hybrid coded as 3.
data$vartype <- ifelse(data$vartype == "tv", 1,
ifelse(data$vartype == "mv", 2,
ifelse(data$vartype == "hyb", 3, NA
)
)
)
Weed management practices have three type, hand coded as 1, herb coded as 2, and herb-hand coded as 3.
# wcp weed control management
levels(data$wcp)[levels(data$wcp) == "hand"] <- 1
levels(data$wcp)[levels(data$wcp) == "herb"] <- 2
levels(data$wcp)[levels(data$wcp) == "herb-hand"] <- 3
Crop status have five level, very poor coded as 1, poor coded as 2, average coded as 3, good coded as 4 and very good coded as 5.
# Crop Status
levels(data$cs)[levels(data$cs) == "very poor"] <- 1
levels(data$cs)[levels(data$cs) == "poor"] <- 2
levels(data$cs)[levels(data$cs) == "average"] <- 3
levels(data$cs)[levels(data$cs) == "good"] <- 4
levels(data$cs)[levels(data$cs) == "very good"] <- 5
all data should be numeric data
#clean the data
num.data <- apply(data[, -c(1,2)], 2, as.numeric) # create dataframe to store the numerical transformation of raw data excluded fno and country
num.data <- as.data.frame(as.matrix(num.data)) # convert from vector to matrix
data <- cbind(data[ , c("fno", "country")], num.data)
data <- data[ , apply(data[, -c(1,2)], 2, var, na.rm = TRUE) != 0] # exclude the column with variation = 0
data <- data[complete.cases(data), ] # exclude row which cantain NA
m.data <- melt(data[, !names(data) %in% c("fno", "country")])
varnames <- colnames(data[, !names(data) %in% c("fno", "country")])
p <- list()
for(i in 1:length(varnames)) {
gdata <- m.data %>% filter(variable == varnames[i])
p[[i]] <- ggplot(gdata, aes(x = value)) +
geom_histogram(stat = "bin") + ggtitle(paste("Histogram of", varnames[i], sep = " "))
}
plot_grid(p[[1]], p[[2]], p[[3]], p[[4]], p[[5]], p[[6]], p[[7]], p[[8]], p[[9]], p[[10]],
p[[11]], p[[12]], p[[13]], p[[14]], p[[15]], p[[16]], p[[17]], p[[18]], p[[19]], p[[20]],
p[[21]], p[[22]], p[[23]], p[[24]], p[[25]], p[[26]], p[[27]], p[[28]], p[[29]], p[[30]],
p[[31]], p[[32]], p[[33]], p[[34]], p[[35]], p[[36]], p[[37]], p[[38]], p[[39]], p[[40]],
p[[41]], p[[42]], p[[43]], p[[44]], p[[45]], p[[46]], p[[47]], p[[48]], p[[49]], p[[50]],
p[[51]], p[[52]], p[[53]], p[[54]], ncol = 3, align = "v")
#head(data)
# select only the variables related to the injury profiles
start.IP <- "dhx" # set to read the data from column named "dhx"
end.IP <- "rtx" # set to read the data from column named "rtx"
start.col.IP <- match(start.IP, names(data)) # match function for check which column of the data mactch the column named following the conditons above
end.col.IP <- match(end.IP, names(data)) # match function for check which column of the data mactch the column named following the conditons above
IP.data <- data[start.col.IP:end.col.IP] # select the columns of raw data which are following the condition above
#==== pre-process before run correlaiton function
# because the correlaiton measures will not allow to perform with variables whihc have not variance.
IP.data <- IP.data[ ,apply(IP.data, 2, var, na.rm = TRUE) != 0] # exclude the column (variables) with variation = 0
country <- data$country #combine two cloumn names country and PS
IP.data <- cbind(country, IP.data)
IP.data[is.na(IP.data)] <- 0
name.country <- as.vector(unique(IP.data$country))
global netowork
global.results <- matrix(nrow = 0, ncol = 6)
for(b in 2:(dim(IP.data)[2]-1)){
#every species will be compared to every other species, so there has to be another loop that iterates down the rest of the columns
for(c in (b + 1):(dim(IP.data)[2])){
#summing the abundances of species of the columns that will be compared
species1.ab <- sum(IP.data[,b])
species2.ab <- sum(IP.data[,c])
#if the column is all 0's no co-occurrence will be performed
if(species1.ab > 1 & species2.ab > 1){
test <- cor.test(IP.data[,b], IP.data[,c], method = "spearman", na.action = na.rm, exact = FALSE)
# There are warnings when setting exact = TRUE because of ties from the output of Spearman's correlation
# stackoverflow.com/questions/10711395/spear-man-correlation and ties
# It would be still valid if the data is not normally distributed.
rho <- test$estimate
p.value <- test$p.value
}
if(species1.ab <=1 | species2.ab <= 1){
rho <- 0
p.value <- 1
}
new.row <- c( names(IP.data)[b], names(IP.data)[c], rho, p.value, species1.ab, species2.ab)
global.results <- rbind(global.results, new.row)
}
}
global.results <- data.frame(data.matrix(global.results))
## Warning in data.row.names(row.names, rowsi, i): some row.names duplicated:
## 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276
## --> row.names NOT used
names(global.results) <- c("var1","var2","rho","p.value","ab1","ab2")
#making sure certain variables are factors
global.results$var1 <- as.character(as.factor(global.results$var1))
global.results$var2 <- as.character(as.factor(global.results$var2))
global.results$rho <- as.numeric(as.character(global.results$rho))
global.results$p.value <- as.numeric(as.character(global.results$p.value))
global.results$ab1 <- as.numeric(as.character(global.results$ab1))
global.results$ab2 <- as.numeric(as.character(global.results$ab2))
head(global.results)
## var1 var2 rho p.value ab1 ab2
## 1 dhx whx 0.13481056 0.006654425 10072 16947
## 2 dhx ssx 0.13906638 0.005107772 10072 2824
## 3 dhx defa 0.12487379 0.012005084 10072 4565
## 4 dhx bphx -0.09582190 0.054297060 10072 14160
## 5 dhx wbpx -0.08930876 0.072955895 10072 2076
## 6 dhx awx 0.04379802 0.379931219 10072 87
sub_global.results <- subset(global.results, as.numeric(as.character(abs(rho))) > 0.2)
gnet <- graph.edgelist(as.matrix(sub_global.results[, 1:2]), directed = FALSE)
#== adjust layout
l <- layout.fruchterman.reingold(gnet)
#== adjust vertices
V(gnet)$color <- "tomato"
V(gnet)$frame.color <- "gray40"
V(gnet)$shape <- "circle"
V(gnet)$size <- 25
V(gnet)$label.color <- "white"
V(gnet)$label.font <- 2
V(gnet)$label.family <- "Helvetica"
V(gnet)$label.cex <- 0.7
#== adjust the edge
E(gnet)$weight <- as.matrix(sub_global.results$rho)
E(gnet)$width <- 1 + E(gnet)$weight*5
col <- c("firebrick3", "forestgreen")
colc <- cut(sub_global.results$rho, breaks = c(-1, 0, 1), include.lowest = TRUE)
E(gnet)$color <- col[colc]
#== plot network model
plot(gnet, layout = l * 1.0, main = "single network model of injuries profiles")
#=====co_occurrence_pairwise.R====
country.results <- matrix(nrow = 0, ncol = 7) # create results to store the outputs
options(warnings = -1) # setting not to show the massages as the warnings
for(a in 1:length(name.country)){
# subset the raw data by groups of countries and cluster of production situation
country.temp <- name.country[a]
#subset the dataset for those treatments
temp <- subset(IP.data, country == country.temp)
#in this case the community data started at column 6, so the loop for co-occurrence has to start at that point
for(b in 2:(dim(temp)[2]-1)){
#every species will be compared to every other species, so there has to be another loop that iterates down the rest of the columns
for(c in (b + 1):(dim(temp)[2])){
#summing the abundances of species of the columns that will be compared
species1.ab <- sum(temp[,b])
species2.ab <- sum(temp[,c])
#if the column is all 0's no co-occurrence will be performed
if(species1.ab > 1 & species2.ab > 1){
test <- cor.test(temp[,b], temp[,c], method = "spearman", na.action = na.rm, exact = FALSE)
# There are warnings when setting exact = TRUE because of ties from the output of Spearman's correlation
# stackoverflow.com/questions/10711395/spear-man-correlation and ties
# It would be still valid if the data is not normally distributed.
rho <- test$estimate
p.value <- test$p.value
}
if(species1.ab <=1 | species2.ab <= 1){
rho <- 0
p.value <- 1
}
new.row <- c(name.country[a], names(temp)[b], names(temp)[c], rho, p.value, species1.ab, species2.ab)
country.results <- rbind(country.results, new.row)
}
}
print(a/length(name.country))
}
## [1] 0.2
## [1] 0.4
## [1] 0.6
## [1] 0.8
## [1] 1
country.results <- data.frame(data.matrix(country.results))
names(country.results) <- c("country","var1","var2","rho","p.value","ab1","ab2")
#making sure certain variables are factors
country.results$country <- as.factor(country.results$country)
country.results$taxa1 <- as.character(as.factor(country.results$var1))
country.results$taxa2 <- as.character(as.factor(country.results$var2))
country.results$rho <- as.numeric(as.character(country.results$rho))
country.results$p.value <- as.numeric(as.character(country.results$p.value))
country.results$ab1 <- as.numeric(as.character(country.results$ab1))
country.results$ab2 <- as.numeric(as.character(country.results$ab2))
head(country.results)
## country var1 var2 rho p.value ab1 ab2 taxa1 taxa2
## 1 PHL dhx whx 0.06524012 0.6891879742 1307 2089 dhx whx
## 2 PHL dhx ssx -0.12272044 0.4506097948 1307 109 dhx ssx
## 3 PHL dhx defa 0.18989475 0.2405423584 1307 1941 dhx defa
## 4 PHL dhx bphx -0.11699678 0.4721633481 1307 1286 dhx bphx
## 5 PHL dhx wbpx -0.50743562 0.0008316712 1307 84 dhx wbpx
## 6 PHL dhx awx 0.00000000 1.0000000000 1307 1 dhx awx
sub_country.results <- subset(country.results, as.numeric(as.character(abs(rho))) > 0.2)
results_sub.by.country <- list()
for(i in 1: length(name.country)){
results_sub.by.country[[i]] <- subset(sub_country.results, country == name.country[i])
}
### Network analysis
# head(results_sub.by.group[[1]][,2:3]) # get the list
#layout(matrix(c(1:14), 9, 2, byrow = TRUE))
cnet <- list()
for(i in 1:length(name.country)){
cnet[[i]] <- graph.edgelist(as.matrix(results_sub.by.country[[i]][, 2:3]), directed = FALSE)
#== adjust layout
l <- layout.circle(cnet[[i]])
#== adjust vertices
V(cnet[[i]])$color <- "tomato"
V(cnet[[i]])$frame.color <- "gray40"
V(cnet[[i]])$shape <- "circle"
V(cnet[[i]])$size <- 25
V(cnet[[i]])$label.color <- "white"
V(cnet[[i]])$label.font <- 2
V(cnet[[i]])$label.family <- "Helvetica"
V(cnet[[i]])$label.cex <- 0.7
#== adjust the edge
E(cnet[[i]])$weight <- as.matrix(results_sub.by.country[[i]][, 4])
E(cnet[[i]])$width <- 1 + E(cnet[[i]])$weight*5
col <- c("firebrick3", "forestgreen")
colc <- cut(results_sub.by.country[[i]]$rho, breaks = c(-1, 0, 1), include.lowest = TRUE)
E(cnet[[i]])$color <- col[colc]
#== plot network model
plot(cnet[[i]], layout = l * 1.0, main = paste( "network model of", name.country[i]))
}
network.value <- list()
for(i in 1:length(cnet)){
E(cnet[[i]])$weight <- abs(as.matrix(results_sub.by.country[[i]][, 4]))
network.value[[i]] <- data.frame(
country = name.country[i],
node = V(cnet[[i]])$name,
degree = degree(cnet[[i]]),
betweenness = betweenness(cnet[[i]]),
closeness = closeness(cnet[[i]]),
eigenvector = evcent(cnet[[i]])$vector,
clusterCoef = transitivity(cnet[[i]],type=c("local"))
)
network.value[[i]]$res = as.vector(lm(eigenvector ~ betweenness, data = network.value[[i]])$residuals)
}
network.value <- as.data.frame(do.call("rbind", network.value))
row.names(network.value ) <- NULL
# compute the topological properties
m.net.value <- melt(network.value[1:7])
## Using country, node as id variables
m.net.value %>% ggplot(aes(x= value, y = reorder(node, value))) +
geom_point(size = 3, aes(color = country)) +
theme_bw() +
theme(panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_line(color = "grey", linetype = 3)) +
ylab("Variables") +
facet_grid(.~variable, scale = "free")
## Warning: Removed 8 rows containing missing values (geom_point).
(Willocquet et al. 2004)
layout(matrix(c(1:9), 5, 2, byrow = TRUE))
## Warning in matrix(c(1:9), 5, 2, byrow = TRUE): data length [9] is not a
## sub-multiple or multiple of the number of rows [5]
for(i in 1:length(name.country)){
plot <- network.value %>%
filter(country == name.country[i]) %>%
ggplot(aes(x = betweenness, y = eigenvector,
label = node,
color = res,
size = abs(res))
) +
xlab("Betweenness Centrality") +
ylab("Eigencvector Centrality") +
geom_text() +
ggtitle(paste("Key Actor Analysis for Injuiry Profiles of", name.country[i]))
print(plot)
}
Wickham, Hadley. 2011. “The Split-Apply-Combine Strategy for Data Analysis.” Journal of Statistical Software 40 (1): 1–29. http://www.jstatsoft.org/v40/i01/.
Wickham, Hadley, and Romain Francois. 2015. Dplyr: A Grammar of Data Manipulation. http://CRAN.R-project.org/package=dplyr.
Wickham, and Hadley. 2007. “Reshaping Data with the Reshape Package.” Journal of Statistical Software 21 (12). http://www.jstatsoft.org/v21/i12/paper.
Willocquet, Laetitia, Francisco A Elazegui, Nancy Castilla, Luzviminda Fernandez, Kenneth S Fischer, ShaoBing Peng, Paul S Teng, et al. 2004. “Research Priorities for Rice Pest Management in Tropical Asia: A Simulation Analysis of Yield Losses and Management Efficiencies.” Phytopathology 94 (7). Am Phytopath Society: 672–82.